# Clear environment of variables and functions
rm(list = ls(all = TRUE))
# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)# Load libraries
library(tidyverse)
library(corrplot)
library(GGally) # For ggpairs, a detailed correlation graphic
library(gridExtra) # For grouping graphs
library(stringr) # Manipulate strings
library(lubridate) # Time Series with respect to Weeks
library(kableExtra) # Prettier tables
library(janitor) # Prettier cross-tabs
library(MultinomialCI) # To calculate multinomial confidence intervals
library(here)
library(dichromat)
library(leaflet)
library(DT)
library(RColorBrewer)
library(psych)
library(plotly)
options(knitr.table.format = "html") # setting for kable table to use html styles# Modify columns types
# condition: 1 to 5
# Grade: 1 to 13
house_data <- dat %>%
drop_na() %>%
filter(bedrooms <= 11) %>%
mutate(renovated = ifelse(yr_renovated == 0, 0, 1)) %>%
mutate(zipcode = as.factor(zipcode),
year_sold = year(date),
month_sold = month(date)) %>%
mutate(year_sold = as.factor(year_sold),
month_sold = as.factor(month_sold),
condition = as.factor(condition),
grade = as.factor(grade),
bedrooms = as.factor(bedrooms),
bathrooms = as.factor(bathrooms),
floors = as.factor(floors),
waterfront = as.factor(waterfront),
renovated = as.factor(renovated)) %>%
mutate(price_per_sqft = price/sqft_living) %>%
select(date, month_sold, year_sold, price, price_per_sqft, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront, condition, grade, yr_built, zipcode, lat, long, renovated) %>%
rename(date_sold = date,
selling_price = price)
summary(house_data)## date_sold month_sold year_sold
## Min. :2014-05-02 00:00:00 5 :2414 2014:14632
## 1st Qu.:2014-07-22 00:00:00 4 :2231 2015: 6980
## Median :2014-10-16 00:00:00 7 :2211
## Mean :2014-10-29 04:46:26 6 :2179
## 3rd Qu.:2015-02-17 00:00:00 8 :1940
## Max. :2015-05-27 00:00:00 10 :1878
## (Other):8759
## selling_price price_per_sqft bedrooms bathrooms
## Min. : 75000 Min. : 87.59 3 :9824 2.5 :5380
## 1st Qu.: 321838 1st Qu.:182.29 4 :6882 1 :3852
## Median : 450000 Median :244.63 2 :2760 1.75 :3047
## Mean : 540084 Mean :264.15 5 :1601 2.25 :2047
## 3rd Qu.: 645000 3rd Qu.:318.32 6 : 272 2 :1930
## Max. :7700000 Max. :810.14 1 : 199 1.5 :1446
## (Other): 74 (Other):3910
## sqft_living sqft_lot floors waterfront condition
## Min. : 290 Min. : 520 1 :10679 0:21449 1: 30
## 1st Qu.: 1426 1st Qu.: 5040 1.5: 1910 1: 163 2: 172
## Median : 1910 Median : 7619 2 : 8241 3:14031
## Mean : 2080 Mean : 15107 2.5: 161 4: 5679
## 3rd Qu.: 2550 3rd Qu.: 10688 3 : 613 5: 1700
## Max. :13540 Max. :1651359 3.5: 8
##
## grade yr_built zipcode lat
## 7 :8980 Min. :1900 98103 : 601 Min. :47.16
## 8 :6068 1st Qu.:1951 98038 : 590 1st Qu.:47.47
## 9 :2615 Median :1975 98115 : 583 Median :47.57
## 6 :2038 Mean :1971 98052 : 574 Mean :47.56
## 10 :1134 3rd Qu.:1997 98117 : 553 3rd Qu.:47.68
## 11 : 399 Max. :2015 98042 : 548 Max. :47.78
## (Other): 378 (Other):18163
## long renovated
## Min. :-122.5 0:20698
## 1st Qu.:-122.3 1: 914
## Median :-122.2
## Mean :-122.2
## 3rd Qu.:-122.1
## Max. :-121.3
##
.
+ Price ranges from $75,000 to $7,700,000, Median at $450,000 and Mean at $540,088. Slightly skewed towards the right
+ Maximum houses are from 98103 zipcode, followed by 98038 and 98115
+ Most houses are of grade “7” and condition “4” + Only 914 houses are renovated and 20,699 are not renovated + Data consists of 21613 observations and 15 variables. + We have data of house sales from May 2014 to May 2015
# Function to get the relevant grouped data frames
# df -> dataframe
# groupVariable -> the group by column
# output -> the resulting data frame
getGroupedUnivariateData <- function(df, groupVariable) {
# group_by() grouped variable
# summarise() reduces variable to descriptive stat
# count is the new variable name, n() is a counting function
# percent is a new variable, sum() and nrow() are functions
# tot_selling_price -> total selling price per grouped variable
# order_by() tot_selling_price
result <- df %>%
group_by_(groupVariable) %>%
summarise(count = n(),
percent = (sum(count) / nrow(df)) * 100,
tot_revenue = sum(selling_price),
avg_revenue = (mean(selling_price))) %>%
arrange(desc(tot_revenue)) %>%
as.data.frame()
return(result)
}# price and price per sqft by zipcode
house_data_month <- getGroupedUnivariateData(house_data, "month_sold")
house_data_month %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| month_sold | count | percent | tot_revenue | avg_revenue |
|---|---|---|---|---|
| 5 | 2414 | 11.169721 | 1329555849 | 550768.8 |
| 4 | 2231 | 10.322969 | 1253460076 | 561837.8 |
| 6 | 2179 | 10.082362 | 1215804795 | 557964.6 |
| 7 | 2211 | 10.230427 | 1204527958 | 544788.8 |
| 8 | 1940 | 8.976494 | 1040703837 | 536445.3 |
| 3 | 1875 | 8.675736 | 1019957226 | 543977.2 |
| 10 | 1878 | 8.689617 | 1012292653 | 539027.0 |
| 9 | 1774 | 8.208403 | 938896280 | 529253.8 |
| 12 | 1471 | 6.806404 | 771483406 | 524461.9 |
| 11 | 1411 | 6.528780 | 736486984 | 521961.0 |
| 2 | 1250 | 5.783824 | 634814214 | 507851.4 |
| 1 | 978 | 4.525264 | 514301730 | 525870.9 |
# price and price per sqft by zipcode
house_data_bedrooms <- getGroupedUnivariateData(house_data, "bedrooms")
house_data_bedrooms %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| bedrooms | count | percent | tot_revenue | avg_revenue |
|---|---|---|---|---|
| 3 | 9824 | 45.4562280 | 4580263939 | 466232.1 |
| 4 | 6882 | 31.8434203 | 4372957028 | 635419.5 |
| 5 | 1601 | 7.4079215 | 1259346326 | 786599.8 |
| 2 | 2760 | 12.7706830 | 1107788602 | 401372.7 |
| 6 | 272 | 1.2585601 | 224541613 | 825520.6 |
| 1 | 199 | 0.9207847 | 63210934 | 317642.9 |
| 7 | 38 | 0.1758282 | 36145017 | 951184.7 |
| 8 | 13 | 0.0601518 | 14366000 | 1105076.9 |
| 9 | 6 | 0.0277624 | 5363999 | 893999.8 |
| 0 | 13 | 0.0601518 | 5323550 | 409503.8 |
| 10 | 3 | 0.0138812 | 2458000 | 819333.3 |
| 11 | 1 | 0.0046271 | 520000 | 520000.0 |
# price and price per sqft by zipcode
house_data_floors <- getGroupedUnivariateData(house_data, "floors")
house_data_floors %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| floors | count | percent | tot_revenue | avg_revenue |
|---|---|---|---|---|
| 2 | 8241 | 38.1315936 | 5347512077 | 648891.2 |
| 1 | 10679 | 49.4123635 | 4721849156 | 442162.1 |
| 1.5 | 1910 | 8.8376828 | 1067653028 | 558980.6 |
| 3 | 613 | 2.8363872 | 357088462 | 582526.0 |
| 2.5 | 161 | 0.7449565 | 170715785 | 1060346.5 |
| 3.5 | 8 | 0.0370165 | 7466500 | 933312.5 |
# price and price per sqft by zipcode
house_data_waterfront <- getGroupedUnivariateData(house_data, "waterfront")
house_data_waterfront %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| waterfront | count | percent | tot_revenue | avg_revenue |
|---|---|---|---|---|
| 0 | 21449 | 99.2457894 | 11401399216 | 531558.5 |
| 1 | 163 | 0.7542106 | 270885792 | 1661876.0 |
# price and price per sqft by zipcode
house_data_zipcode <- getGroupedUnivariateData(house_data, "zipcode")
house_data_zipcode %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| zipcode | count | percent | tot_revenue | avg_revenue |
|---|---|---|---|---|
| 98004 | 317 | 1.4667777 | 429828885 | 1355927.1 |
| 98006 | 498 | 2.3042754 | 428123020 | 859684.8 |
| 98052 | 574 | 2.6559319 | 370362856 | 645231.5 |
| 98115 | 583 | 2.6975754 | 361402019 | 619900.5 |
| 98103 | 601 | 2.7808625 | 351481365 | 584827.6 |
| 98033 | 432 | 1.9988895 | 347206834 | 803719.5 |
| 98040 | 282 | 1.3048306 | 336772866 | 1194230.0 |
| 98117 | 553 | 2.5587636 | 318967639 | 576795.0 |
| 98074 | 441 | 2.0405330 | 302352147 | 685605.8 |
| 98112 | 269 | 1.2446789 | 294689323 | 1095499.3 |
| 98034 | 545 | 2.5217472 | 284300808 | 521652.9 |
| 98075 | 359 | 1.6611142 | 283817019 | 790576.7 |
| 98053 | 405 | 1.8739589 | 274656039 | 678163.1 |
| 98027 | 412 | 1.9063483 | 254200124 | 616990.6 |
| 98199 | 317 | 1.4667777 | 251007196 | 791820.8 |
| 98059 | 468 | 2.1654636 | 230982585 | 493552.5 |
| 98038 | 590 | 2.7299648 | 216451884 | 366867.6 |
| 98118 | 508 | 2.3505460 | 212159816 | 417637.4 |
| 98116 | 330 | 1.5269295 | 204149276 | 618634.2 |
| 98144 | 343 | 1.5870813 | 203929844 | 594547.7 |
| 98105 | 229 | 1.0595965 | 197586978 | 862825.2 |
| 98029 | 321 | 1.4852860 | 196661809 | 612653.6 |
| 98125 | 410 | 1.8970942 | 192476866 | 469455.8 |
| 98133 | 494 | 2.2857672 | 191176714 | 386997.4 |
| 98155 | 446 | 2.0636683 | 188981660 | 423725.7 |
| 98122 | 290 | 1.3418471 | 183964452 | 634360.2 |
| 98008 | 283 | 1.3094577 | 182678588 | 645507.4 |
| 98177 | 255 | 1.1799001 | 172427275 | 676185.4 |
| 98056 | 406 | 1.8785860 | 170881563 | 420890.5 |
| 98042 | 548 | 2.5356284 | 170774394 | 311632.1 |
| 98065 | 310 | 1.4343883 | 163667973 | 527961.2 |
| 98058 | 455 | 2.1053119 | 160891929 | 353608.6 |
| 98119 | 184 | 0.8513789 | 156298435 | 849448.0 |
| 98072 | 273 | 1.2631871 | 155598661 | 569958.5 |
| 98107 | 266 | 1.2307977 | 154028209 | 579053.4 |
| 98126 | 354 | 1.6379789 | 150346050 | 424706.4 |
| 98136 | 263 | 1.2169165 | 145094121 | 551688.7 |
| 98023 | 499 | 2.3089025 | 143079663 | 286732.8 |
| 98005 | 168 | 0.7773459 | 136107699 | 810164.9 |
| 98077 | 198 | 0.9161577 | 135189426 | 682774.9 |
| 98028 | 283 | 1.3094577 | 130881850 | 462480.0 |
| 98166 | 254 | 1.1752730 | 117914887 | 464231.8 |
| 98092 | 351 | 1.6240977 | 117557293 | 334921.1 |
| 98039 | 50 | 0.2313530 | 108030330 | 2160606.6 |
| 98106 | 335 | 1.5500648 | 107059767 | 319581.4 |
| 98146 | 288 | 1.3325930 | 103531173 | 359483.2 |
| 98001 | 362 | 1.6749954 | 101651298 | 280804.7 |
| 98045 | 221 | 1.0225800 | 97123109 | 439471.1 |
| 98109 | 109 | 0.5043494 | 95878975 | 879623.6 |
| 98011 | 195 | 0.9022765 | 95618536 | 490351.5 |
| 98102 | 105 | 0.4858412 | 94632118 | 901258.3 |
| 98007 | 141 | 0.6524153 | 87011817 | 617105.1 |
| 98198 | 280 | 1.2955765 | 84806087 | 302878.9 |
| 98003 | 280 | 1.2955765 | 82351158 | 294111.3 |
| 98031 | 274 | 1.2678142 | 82347930 | 300539.9 |
| 98055 | 268 | 1.2400518 | 81542245 | 304262.1 |
| 98178 | 262 | 1.2122895 | 81380542 | 310612.8 |
| 98019 | 190 | 0.8791412 | 80709862 | 424788.7 |
| 98030 | 256 | 1.1845271 | 75824123 | 296188.0 |
| 98022 | 234 | 1.0827318 | 73875977 | 315709.3 |
| 98108 | 186 | 0.8606330 | 66156204 | 355678.5 |
| 98168 | 269 | 1.2446789 | 64648332 | 240328.4 |
| 98070 | 118 | 0.5459930 | 57522596 | 487479.6 |
| 98014 | 124 | 0.5737553 | 56496522 | 455617.1 |
| 98024 | 81 | 0.3747918 | 47022670 | 580526.8 |
| 98002 | 199 | 0.9207847 | 46622523 | 234284.0 |
| 98010 | 100 | 0.4627059 | 42366599 | 423666.0 |
| 98188 | 136 | 0.6292800 | 39314655 | 289078.3 |
| 98032 | 125 | 0.5783824 | 31412030 | 251296.2 |
| 98148 | 57 | 0.2637424 | 16239790 | 284908.6 |
# price and price per sqft by zipcode
house_data_renovated <- getGroupedUnivariateData(house_data, "renovated")
house_data_renovated %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| renovated | count | percent | tot_revenue | avg_revenue |
|---|---|---|---|---|
| 0 | 20698 | 95.770868 | 10977298575 | 530355.5 |
| 1 | 914 | 4.229132 | 694986433 | 760379.0 |
# Code histograms using grid.arrange so can see all quant variables together group by Ship.Mode
grid.arrange(
house_data_bedrooms %>%
ggplot(aes(x = reorder(bedrooms, -count), y = count)) +
xlab("bedrooms") +
theme_classic() +
geom_bar(stat = "identity", fill = "darkolivegreen3"),
house_data_floors %>%
ggplot(aes(x = reorder(floors, -count), y = count)) +
xlab("floors") +
theme_classic() +
geom_bar(stat = "identity", fill = "darkolivegreen3"),
house_data_waterfront %>%
ggplot(aes(x = reorder(waterfront, -count), y = count)) +
xlab("waterfront") +
theme_classic() +
geom_bar(stat = "identity", fill = "darkolivegreen3"),
house_data_zipcode %>%
ggplot(aes(x = reorder(zipcode, -count), y = count)) +
xlab("zipcode") +
theme_classic() +
geom_bar(stat = "identity", fill = "darkolivegreen3"),
house_data_renovated %>%
ggplot(aes(x = reorder(renovated, -count), y = count)) +
xlab("renovated") +
theme_classic() +
geom_bar(stat = "identity", fill = "darkolivegreen3"),
ncol = 2
) ## date_sold month_sold year_sold
## Min. :2014-05-02 00:00:00 5 :2414 2014:14632
## 1st Qu.:2014-07-22 00:00:00 4 :2231 2015: 6980
## Median :2014-10-16 00:00:00 7 :2211
## Mean :2014-10-29 04:46:26 6 :2179
## 3rd Qu.:2015-02-17 00:00:00 8 :1940
## Max. :2015-05-27 00:00:00 10 :1878
## (Other):8759
## selling_price price_per_sqft bedrooms bathrooms
## Min. : 75000 Min. : 87.59 3 :9824 2.5 :5380
## 1st Qu.: 321838 1st Qu.:182.29 4 :6882 1 :3852
## Median : 450000 Median :244.63 2 :2760 1.75 :3047
## Mean : 540084 Mean :264.15 5 :1601 2.25 :2047
## 3rd Qu.: 645000 3rd Qu.:318.32 6 : 272 2 :1930
## Max. :7700000 Max. :810.14 1 : 199 1.5 :1446
## (Other): 74 (Other):3910
## sqft_living sqft_lot floors waterfront condition
## Min. : 290 Min. : 520 1 :10679 0:21449 1: 30
## 1st Qu.: 1426 1st Qu.: 5040 1.5: 1910 1: 163 2: 172
## Median : 1910 Median : 7619 2 : 8241 3:14031
## Mean : 2080 Mean : 15107 2.5: 161 4: 5679
## 3rd Qu.: 2550 3rd Qu.: 10688 3 : 613 5: 1700
## Max. :13540 Max. :1651359 3.5: 8
##
## grade yr_built zipcode lat
## 7 :8980 Min. :1900 98103 : 601 Min. :47.16
## 8 :6068 1st Qu.:1951 98038 : 590 1st Qu.:47.47
## 9 :2615 Median :1975 98115 : 583 Median :47.57
## 6 :2038 Mean :1971 98052 : 574 Mean :47.56
## 10 :1134 3rd Qu.:1997 98117 : 553 3rd Qu.:47.68
## 11 : 399 Max. :2015 98042 : 548 Max. :47.78
## (Other): 378 (Other):18163
## long renovated
## Min. :-122.5 0:20698
## 1st Qu.:-122.3 1: 914
## Median :-122.2
## Mean :-122.2
## 3rd Qu.:-122.1
## Max. :-121.3
##
# Code histograms using ggplot() so we can see all quant variables
grid.arrange(
house_data %>%
ggplot(aes(selling_price/1000)) +
geom_histogram(fill = "darkolivegreen3") +
xlab("selling price (in 1000 $)")+
theme_classic(),
house_data %>%
ggplot(aes(price_per_sqft)) +
geom_histogram(fill = "darkolivegreen3") +
xlab("price per sqft (in $)") +
theme_classic(),
house_data %>%
ggplot(aes(sqft_living)) +
geom_histogram(fill = "darkolivegreen3") +
xlab("sqft of home")+
theme_classic(),
house_data %>%
ggplot(aes(lat)) +
geom_histogram(fill = "darkolivegreen3") +
xlab("latitude") +
theme_classic(),
house_data %>%
ggplot(aes(long)) +
geom_histogram(fill = "darkolivegreen3") +
xlab("longitude")+
theme_classic()
)# get box plot of all the quanitative variables
par(mfrow = c(2, 2))
boxplot(selling_price/1000, main = "Selling Price / 1000")
boxplot(price_per_sqft, main = "price per sqft")
boxplot(sqft_living, main = "sqft living")# Make cross-tabs between two categorical variables using janitor package
# cross-tab of bedrooms and floors
house_data %>%
tabyl(bedrooms, floors) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 2) %>%
adorn_title() %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| floors | |||||||
|---|---|---|---|---|---|---|---|
| bedrooms | 1 | 1.5 | 2 | 2.5 | 3 | 3.5 | Total |
| 0 | 0.02% | 0.00% | 0.03% | 0.00% | 0.01% | 0.00% | 0.06% |
| 1 | 0.75% | 0.10% | 0.06% | 0.00% | 0.01% | 0.00% | 0.92% |
| 2 | 9.03% | 0.84% | 2.30% | 0.02% | 0.57% | 0.01% | 12.77% |
| 3 | 25.24% | 3.64% | 14.43% | 0.26% | 1.87% | 0.02% | 45.46% |
| 4 | 11.03% | 3.23% | 17.04% | 0.27% | 0.28% | 0.00% | 31.84% |
| 5 | 2.80% | 0.86% | 3.59% | 0.11% | 0.06% | 0.00% | 7.41% |
| 6 | 0.48% | 0.14% | 0.55% | 0.06% | 0.02% | 0.00% | 1.26% |
| 7 | 0.04% | 0.03% | 0.09% | 0.01% | 0.00% | 0.00% | 0.18% |
| 8 | 0.02% | 0.00% | 0.03% | 0.00% | 0.00% | 0.00% | 0.06% |
| 9 | 0.00% | 0.00% | 0.02% | 0.01% | 0.00% | 0.00% | 0.03% |
| 10 | 0.00% | 0.00% | 0.01% | 0.00% | 0.00% | 0.00% | 0.01% |
| 11 | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% |
| Total | 49.41% | 8.84% | 38.13% | 0.74% | 2.84% | 0.04% | 100.00% |
# Make cross-tabs between two categorical variables using janitor package
# cross-tab of bedrooms and floors
house_data %>%
tabyl(bedrooms, month_sold) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 2)%>%
adorn_title() %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| month_sold | |||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| bedrooms | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Total |
| 0 | 0.00% | 0.01% | 0.00% | 0.01% | 0.00% | 0.01% | 0.00% | 0.00% | 0.01% | 0.01% | 0.00% | 0.00% | 0.06% |
| 1 | 0.04% | 0.05% | 0.05% | 0.10% | 0.11% | 0.10% | 0.06% | 0.09% | 0.10% | 0.10% | 0.07% | 0.06% | 0.92% |
| 2 | 0.56% | 0.73% | 1.15% | 1.30% | 1.56% | 1.17% | 1.21% | 1.14% | 0.99% | 1.14% | 0.93% | 0.88% | 12.77% |
| 3 | 2.03% | 2.80% | 4.03% | 4.68% | 4.95% | 4.48% | 4.68% | 4.19% | 3.76% | 3.88% | 2.93% | 3.04% | 45.46% |
| 4 | 1.50% | 1.68% | 2.65% | 3.33% | 3.50% | 3.35% | 3.41% | 2.79% | 2.65% | 2.79% | 2.02% | 2.18% | 31.84% |
| 5 | 0.29% | 0.43% | 0.64% | 0.79% | 0.87% | 0.81% | 0.72% | 0.64% | 0.57% | 0.66% | 0.48% | 0.52% | 7.41% |
| 6 | 0.09% | 0.09% | 0.13% | 0.11% | 0.12% | 0.14% | 0.13% | 0.09% | 0.10% | 0.06% | 0.07% | 0.12% | 1.26% |
| 7 | 0.01% | 0.00% | 0.01% | 0.01% | 0.04% | 0.02% | 0.01% | 0.01% | 0.02% | 0.03% | 0.01% | 0.00% | 0.18% |
| 8 | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.01% | 0.00% | 0.00% | 0.01% | 0.00% | 0.01% | 0.06% |
| 9 | 0.00% | 0.00% | 0.01% | 0.00% | 0.00% | 0.00% | 0.00% | 0.01% | 0.00% | 0.00% | 0.00% | 0.00% | 0.03% |
| 10 | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.01% |
| 11 | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% |
| Total | 4.53% | 5.78% | 8.68% | 10.32% | 11.17% | 10.08% | 10.23% | 8.98% | 8.21% | 8.69% | 6.53% | 6.81% | 100.00% |
# cross-tab of bedrooms and waterfront
house_data %>%
tabyl(bedrooms, waterfront) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 2)%>%
adorn_title() %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| waterfront | |||
|---|---|---|---|
| bedrooms | 0 | 1 | Total |
| 0 | 0.06% | 0.00% | 0.06% |
| 1 | 0.90% | 0.02% | 0.92% |
| 2 | 12.63% | 0.14% | 12.77% |
| 3 | 45.16% | 0.30% | 45.46% |
| 4 | 31.66% | 0.19% | 31.84% |
| 5 | 7.32% | 0.09% | 7.41% |
| 6 | 1.24% | 0.02% | 1.26% |
| 7 | 0.18% | 0.00% | 0.18% |
| 8 | 0.06% | 0.00% | 0.06% |
| 9 | 0.03% | 0.00% | 0.03% |
| 10 | 0.01% | 0.00% | 0.01% |
| 11 | 0.00% | 0.00% | 0.00% |
| Total | 99.25% | 0.75% | 100.00% |
# cross-tab of bedrooms and condition
house_data %>%
tabyl(bedrooms, condition) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 2) %>%
adorn_title() %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| condition | ||||||
|---|---|---|---|---|---|---|
| bedrooms | 1 | 2 | 3 | 4 | 5 | Total |
| 0 | 0.00% | 0.00% | 0.05% | 0.00% | 0.00% | 0.06% |
| 1 | 0.02% | 0.05% | 0.57% | 0.22% | 0.06% | 0.92% |
| 2 | 0.06% | 0.24% | 8.23% | 3.32% | 0.93% | 12.77% |
| 3 | 0.04% | 0.32% | 29.19% | 12.54% | 3.37% | 45.46% |
| 4 | 0.02% | 0.17% | 21.19% | 7.78% | 2.68% | 31.84% |
| 5 | 0.00% | 0.00% | 4.77% | 1.93% | 0.70% | 7.41% |
| 6 | 0.00% | 0.01% | 0.73% | 0.40% | 0.11% | 1.26% |
| 7 | 0.00% | 0.00% | 0.12% | 0.04% | 0.02% | 0.18% |
| 8 | 0.00% | 0.00% | 0.04% | 0.01% | 0.01% | 0.06% |
| 9 | 0.00% | 0.00% | 0.03% | 0.00% | 0.00% | 0.03% |
| 10 | 0.00% | 0.00% | 0.00% | 0.01% | 0.00% | 0.01% |
| 11 | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% | 0.00% |
| Total | 0.14% | 0.80% | 64.92% | 26.28% | 7.87% | 100.00% |
# cross-tab of floors and waterfront
house_data %>%
tabyl(floors, waterfront) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 2) %>%
adorn_title() %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| waterfront | |||
|---|---|---|---|
| floors | 0 | 1 | Total |
| 1 | 49.15% | 0.26% | 49.41% |
| 1.5 | 8.74% | 0.10% | 8.84% |
| 2 | 37.78% | 0.35% | 38.13% |
| 2.5 | 0.74% | 0.01% | 0.74% |
| 3 | 2.80% | 0.04% | 2.84% |
| 3.5 | 0.04% | 0.00% | 0.04% |
| Total | 99.25% | 0.75% | 100.00% |
# cross-tab of floors and condition
house_data %>%
tabyl(floors, condition) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 2) %>%
adorn_title() %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| condition | ||||||
|---|---|---|---|---|---|---|
| floors | 1 | 2 | 3 | 4 | 5 | Total |
| 1 | 0.11% | 0.63% | 25.65% | 17.99% | 5.03% | 49.41% |
| 1.5 | 0.02% | 0.07% | 4.06% | 3.14% | 1.54% | 8.84% |
| 2 | 0.01% | 0.09% | 31.95% | 4.91% | 1.18% | 38.13% |
| 2.5 | 0.00% | 0.00% | 0.48% | 0.18% | 0.09% | 0.74% |
| 3 | 0.00% | 0.00% | 2.76% | 0.06% | 0.02% | 2.84% |
| 3.5 | 0.00% | 0.00% | 0.03% | 0.00% | 0.00% | 0.04% |
| Total | 0.14% | 0.80% | 64.92% | 26.28% | 7.87% | 100.00% |
# cross-tab of waterfront and condition
house_data %>%
tabyl(waterfront, condition) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 2) %>%
adorn_title() %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| condition | ||||||
|---|---|---|---|---|---|---|
| waterfront | 1 | 2 | 3 | 4 | 5 | Total |
| 0 | 0.13% | 0.79% | 64.50% | 26.05% | 7.77% | 99.25% |
| 1 | 0.00% | 0.00% | 0.42% | 0.23% | 0.09% | 0.75% |
| Total | 0.14% | 0.80% | 64.92% | 26.28% | 7.87% | 100.00% |
#Function for heatmap
heat_map <- function (x, y,y_title,x_title){
x<-enquo(x)
y<-enquo(y)
house_data %>%
group_by(!!x,!!y) %>%
summarise(count = n()) %>%
ggplot(aes(!!x,!!y)) +
geom_tile(aes(fill = -count))+
ylab(y_title) + xlab(x_title)+
theme_classic() +
scale_fill_continuous(guide = guide_legend(title = "Count")) +
scale_fill_gradient(low = "white", high = "darkolivegreen3")
}
grid.arrange(
heat_map(bedrooms, floors, "floors", "bedrooms"),
heat_map(bedrooms, waterfront, "waterfront", "bedrooms"),
heat_map(bedrooms, condition, "condition", "bedrooms"),
heat_map(bedrooms, renovated, "renovated", "bedrooms"),
heat_map(month_sold, condition, "condition", "month sold"),
nrow=3)The standard measure between quantitative variables is correlation
# Find correlation of quantitative variables
cor_plot <- house_data %>%
select(selling_price, price_per_sqft, sqft_living, yr_built, lat, long)
cor(cor_plot) ## selling_price price_per_sqft sqft_living yr_built
## selling_price 1.00000000 0.5547069 0.70204664 0.05402275
## price_per_sqft 0.55470685 1.0000000 -0.09229020 -0.28983503
## sqft_living 0.70204664 -0.0922902 1.00000000 0.31803659
## yr_built 0.05402275 -0.2898350 0.31803659 1.00000000
## lat 0.30699843 0.4720228 0.05255216 -0.14809274
## long 0.02163710 -0.2359974 0.24020927 0.40933763
## lat long
## selling_price 0.30699843 0.0216371
## price_per_sqft 0.47202276 -0.2359974
## sqft_living 0.05255216 0.2402093
## yr_built -0.14809274 0.4093376
## lat 1.00000000 -0.1354811
## long -0.13548114 1.0000000
bedrooms_bathrooms_average_price <- house_data %>%
group_by(bedrooms, bathrooms) %>%
summarise(Total_Revenue = sum(selling_price)) %>%
spread(bedrooms, Total_Revenue) %>%
as.data.frame()
bedrooms_bathrooms_average_price %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| bathrooms | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3642600 | 839000 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 0.5 | NA | 255000 | 382000 | NA | 312500 | NA | NA | NA | NA | NA | NA | NA |
| 0.75 | 265000 | 6778450 | 8515351 | 4984201 | 662500 | NA | NA | NA | NA | NA | NA | NA |
| 1 | 228000 | 43694934 | 542297424 | 598408675 | 127128723 | 21209082 | 3545000 | 291000 | NA | NA | NA | NA |
| 1.25 | NA | 1763500 | 1406950 | 2420500 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1.5 | 288000 | 3908950 | 120575703 | 330098904 | 112362964 | 21803990 | 1956400 | 885000 | NA | NA | NA | NA |
| 1.75 | NA | 1464200 | 143259595 | 814396658 | 352314065 | 65804785 | 8644000 | NA | NA | NA | NA | NA |
| 2 | NA | 1783000 | 96844945 | 468949131 | 249462431 | 54954450 | 11083200 | NA | NA | NA | 650000 | NA |
| 2.25 | NA | 1744900 | 61083264 | 525264833 | 415407291 | 76428068 | 10599000 | 1909000 | NA | NA | NA | NA |
| 2.5 | 899950 | 979000 | 106556678 | 1219588146 | 1455709408 | 175781842 | 17255170 | 879000 | 700000 | NA | NA | NA |
| 2.75 | NA | NA | 11018842 | 177977194 | 420887477 | 141885906 | 25873550 | 2060000 | 2670000 | NA | NA | NA |
| 3 | NA | NA | 8379650 | 137225819 | 234059206 | 118241055 | 28113940 | 3655000 | 948000 | 1634000 | 660000 | 520000 |
| 3.25 | NA | NA | 6933200 | 140625098 | 281538935 | 131455548 | 10128350 | 419000 | 430000 | NA | NA | NA |
| 3.5 | NA | NA | 535000 | 113443030 | 393349682 | 151988257 | 14947160 | 5010000 | 1970000 | NA | NA | NA |
| 3.75 | NA | NA | NA | 15422600 | 101730616 | 55468550 | 10475694 | 1663160 | 808000 | NA | NA | NA |
| 4 | NA | NA | NA | 12668000 | 77074010 | 62900750 | 10715750 | 3449957 | 4200000 | 1400000 | NA | NA |
| 4.25 | NA | NA | NA | 10106150 | 60567790 | 34223240 | 12365000 | 3274000 | NA | NA | NA | NA |
| 4.5 | NA | NA | NA | 8685000 | 44919910 | 52449213 | 20822899 | 4605000 | NA | 1879999 | NA | NA |
| 4.75 | NA | NA | NA | NA | 11197990 | 24147100 | 7960000 | 3184900 | NA | NA | NA | NA |
| 5 | NA | NA | NA | NA | 12441530 | 15438800 | 6773000 | NA | 490000 | NA | NA | NA |
| 5.25 | NA | NA | NA | NA | 8115000 | 12141800 | 2218500 | NA | NA | NA | 1148000 | NA |
| 5.5 | NA | NA | NA | NA | 11775000 | 12208000 | NA | 1240000 | NA | NA | NA | NA |
| 5.75 | NA | NA | NA | NA | 1940000 | 7490000 | NA | 540000 | NA | NA | NA | NA |
| 6 | NA | NA | NA | NA | NA | 10231000 | 5300000 | NA | 2150000 | NA | NA | NA |
| 6.25 | NA | NA | NA | NA | NA | 6188000 | NA | NA | NA | NA | NA | NA |
| 6.5 | NA | NA | NA | NA | NA | 2238890 | 1180000 | NA | NA | NA | NA | NA |
| 6.75 | NA | NA | NA | NA | NA | 4668000 | NA | 800000 | NA | NA | NA | NA |
| 7.5 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 450000 | NA | NA |
| 7.75 | NA | NA | NA | NA | NA | NA | 6885000 | NA | NA | NA | NA | NA |
| 8 | NA | NA | NA | NA | NA | NA | 7700000 | 2280000 | NA | NA | NA | NA |
bedrooms_bathrooms_average_sqft <- house_data %>%
group_by(bedrooms, bathrooms) %>%
summarise(Average_sqft = mean(sqft_living)) %>%
spread(bedrooms, Average_sqft) %>%
as.data.frame()
bedrooms_bathrooms_average_sqft %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| bathrooms | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2058.286 | 653.3333 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 0.5 | NA | 880.0000 | 880.0000 | NA | 2300.000 | NA | NA | NA | NA | NA | NA | NA |
| 0.75 | 384.000 | 686.2963 | 907.7692 | 1165.000 | 1700.000 | NA | NA | NA | NA | NA | NA | NA |
| 1 | 390.000 | 823.5507 | 1038.0860 | 1253.724 | 1528.338 | 1810.698 | 1626.667 | 2350.000 | NA | NA | NA | NA |
| 1.25 | NA | 1455.0000 | 1110.0000 | 1872.500 | NA | NA | NA | NA | NA | NA | NA | NA |
| 1.5 | 1430.000 | 1111.5000 | 1275.2381 | 1524.613 | 1824.933 | 2063.333 | 2006.667 | 2665.000 | NA | NA | NA | NA |
| 1.75 | NA | 1055.0000 | 1556.2237 | 1707.221 | 2010.549 | 2185.239 | 2346.875 | NA | NA | NA | NA | NA |
| 2 | NA | 1565.0000 | 1456.4907 | 1723.030 | 1967.933 | 2223.545 | 2353.333 | NA | NA | NA | 3610 | NA |
| 2.25 | NA | 1840.0000 | 1575.2627 | 1917.628 | 2341.781 | 2673.362 | 3093.067 | 3016.667 | NA | NA | NA | NA |
| 2.5 | 1863.333 | 1900.0000 | 1744.4975 | 2138.173 | 2601.767 | 2775.725 | 3011.241 | 2260.000 | 2280 | NA | NA | NA |
| 2.75 | NA | NA | 1895.3500 | 2398.142 | 2727.762 | 2829.262 | 3221.290 | 2943.333 | 3120 | NA | NA | NA |
| 3 | NA | NA | 1950.7692 | 2426.904 | 2817.850 | 2946.577 | 2836.978 | 3696.667 | 3345 | 3250 | 2920 | 3000 |
| 3.25 | NA | NA | 2212.5000 | 2442.457 | 3561.059 | 3678.760 | 3673.333 | 4340.000 | 4300 | NA | NA | NA |
| 3.5 | NA | NA | 2560.0000 | 2653.259 | 3571.253 | 3660.112 | 3549.588 | 3540.000 | 4440 | NA | NA | NA |
| 3.75 | NA | NA | NA | 3336.471 | 3797.436 | 3987.500 | 3586.154 | 3705.000 | 3460 | NA | NA | NA |
| 4 | NA | NA | NA | 3374.545 | 4173.586 | 4196.375 | 3765.455 | 3078.000 | 5865 | 4620 | NA | NA |
| 4.25 | NA | NA | NA | 4076.667 | 4593.868 | 4514.800 | 5487.500 | 4170.000 | NA | NA | NA | NA |
| 4.5 | NA | NA | NA | 4276.000 | 4461.719 | 4703.343 | 3900.087 | 4880.000 | NA | 3740 | NA | NA |
| 4.75 | NA | NA | NA | NA | 4532.143 | 5649.545 | 5596.667 | 5340.000 | NA | NA | NA | NA |
| 5 | NA | NA | NA | NA | 5183.571 | 5372.000 | 4200.000 | NA | 2800 | NA | NA | NA |
| 5.25 | NA | NA | NA | NA | 5332.000 | 5335.000 | 4206.667 | NA | NA | NA | 4590 | NA |
| 5.5 | NA | NA | NA | NA | 6430.000 | 6570.000 | NA | 6630.000 | NA | NA | NA | NA |
| 5.75 | NA | NA | NA | NA | 7220.000 | 8465.000 | NA | 3700.000 | NA | NA | NA | NA |
| 6 | NA | NA | NA | NA | NA | 6732.500 | 7390.000 | NA | 4340 | NA | NA | NA |
| 6.25 | NA | NA | NA | NA | NA | 8345.000 | NA | NA | NA | NA | NA | NA |
| 6.5 | NA | NA | NA | NA | NA | 7270.000 | 6260.000 | NA | NA | NA | NA | NA |
| 6.75 | NA | NA | NA | NA | NA | 9640.000 | NA | 7480.000 | NA | NA | NA | NA |
| 7.5 | NA | NA | NA | NA | NA | NA | NA | NA | NA | 4050 | NA | NA |
| 7.75 | NA | NA | NA | NA | NA | NA | 9890.000 | NA | NA | NA | NA | NA |
| 8 | NA | NA | NA | NA | NA | NA | 12050.000 | 13540.000 | NA | NA | NA | NA |
# how does revenue vary by date sold and what is the impact of waterfront view and renovation on their revenue
house_data %>%
mutate(waterfront = ifelse(waterfront == 1, "Waterfront View", "No Waterfront View"),
renovated = ifelse(renovated ==1, "Renovated", "Not Renovated")) %>%
group_by(renovated, waterfront) %>%
ggplot(aes(x = date_sold, y = selling_price/1000)) +
geom_line(color = "darkolivegreen3") +
theme_classic() +
facet_grid(renovated ~ waterfront) +
labs(y = 'Selling price (, 1000 $)', x = 'Date Sold')# how do sales vary by sqft_living, sqft_lot and house condition
sqft_living_lot_condition <- ggplot(house_data, aes(x = sqft_living, y = sqft_lot, color = condition)) +
geom_point(alpha = 0.5) +
ylim(0, 1000000) +
geom_smooth(method=lm, se=FALSE, color="black") +
theme_classic()+
theme(legend.title = element_text(size=10),
plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = 'Living Area (sq.ft)', y = 'Lot Area (sq.ft)', title = "Sales are more for houses having good condition",
subtitle = "House built in 1900 - 2015") +
guides(color = guide_legend(title = 'Condition')) +
scale_x_continuous(breaks=seq(0, 12000, 2000)) +
scale_color_brewer(palette="Paired")
sqft_living_lot_condition# how do sales vary by sqft_living, sqft_lot and number of bedrooms
ggplot(house_data, aes(x = sqft_living, y = sqft_lot, color = bedrooms)) +
geom_point(alpha = 0.5) +
ylim(0, 1000000) +
geom_smooth(method=lm, se=FALSE, color="black") +
theme_classic()+
theme(legend.title = element_text(size=10),
plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = 'Living Area (sq.ft)', y = 'Lot Area (sq.ft)', title = "House Sales in King County, USA",
subtitle = "House built in 1900 - 2015") +
guides(color = guide_legend(title = 'Bedrooms')) +
scale_x_continuous(breaks=seq(0, 12000, 2000)) +
scale_color_brewer(palette="Paired")# Linear regression
mod <- lm(log(price_per_sqft) ~ date_sold + bedrooms + sqft_lot + floors + condition + grade + zipcode + yr_built + waterfront + renovated + lat + long, data = house_data)
# Review output
summary(mod) ##
## Call:
## lm(formula = log(price_per_sqft) ~ date_sold + bedrooms + sqft_lot +
## floors + condition + grade + zipcode + yr_built + waterfront +
## renovated + lat + long, data = house_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23272 -0.12619 0.00437 0.13033 1.12689
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.682e+01 8.257e+00 -8.093 6.13e-16 ***
## date_sold 2.485e-09 1.511e-10 16.449 < 2e-16 ***
## bedrooms1 2.186e-01 6.486e-02 3.370 0.000752 ***
## bedrooms2 7.486e-02 6.324e-02 1.184 0.236487
## bedrooms3 -5.773e-02 6.314e-02 -0.914 0.360570
## bedrooms4 -1.592e-01 6.318e-02 -2.520 0.011737 *
## bedrooms5 -2.458e-01 6.337e-02 -3.879 0.000105 ***
## bedrooms6 -3.207e-01 6.449e-02 -4.973 6.63e-07 ***
## bedrooms7 -4.464e-01 7.220e-02 -6.183 6.42e-10 ***
## bedrooms8 -4.593e-01 8.683e-02 -5.290 1.24e-07 ***
## bedrooms9 -5.899e-01 1.088e-01 -5.423 5.93e-08 ***
## bedrooms10 -6.216e-01 1.398e-01 -4.448 8.73e-06 ***
## bedrooms11 -2.998e-01 2.251e-01 -1.332 0.182911
## sqft_lot 5.009e-07 3.900e-08 12.844 < 2e-16 ***
## floors1.5 -1.002e-02 5.820e-03 -1.722 0.085114 .
## floors2 -3.518e-02 4.301e-03 -8.178 3.04e-16 ***
## floors2.5 -4.123e-02 1.768e-02 -2.332 0.019710 *
## floors3 -6.283e-02 1.076e-02 -5.836 5.41e-09 ***
## floors3.5 -6.795e-02 7.740e-02 -0.878 0.379962
## condition2 1.389e-01 4.353e-02 3.192 0.001416 **
## condition3 2.430e-01 4.054e-02 5.995 2.06e-09 ***
## condition4 2.745e-01 4.056e-02 6.769 1.33e-11 ***
## condition5 3.187e-01 4.079e-02 7.813 5.83e-15 ***
## grade3 -7.571e-01 2.620e-01 -2.890 0.003854 **
## grade4 -7.225e-01 2.318e-01 -3.118 0.001826 **
## grade5 -9.144e-01 2.298e-01 -3.979 6.95e-05 ***
## grade6 -9.321e-01 2.296e-01 -4.059 4.95e-05 ***
## grade7 -9.634e-01 2.296e-01 -4.195 2.74e-05 ***
## grade8 -9.472e-01 2.297e-01 -4.125 3.73e-05 ***
## grade9 -8.822e-01 2.297e-01 -3.841 0.000123 ***
## grade10 -8.357e-01 2.298e-01 -3.637 0.000276 ***
## grade11 -7.598e-01 2.300e-01 -3.304 0.000955 ***
## grade12 -6.779e-01 2.307e-01 -2.939 0.003299 **
## grade13 -4.753e-01 2.375e-01 -2.001 0.045391 *
## zipcode98002 -5.045e-03 1.942e-02 -0.260 0.795007
## zipcode98003 1.037e-02 1.736e-02 0.597 0.550303
## zipcode98004 1.005e+00 3.157e-02 31.841 < 2e-16 ***
## zipcode98005 6.287e-01 3.374e-02 18.636 < 2e-16 ***
## zipcode98006 5.926e-01 2.755e-02 21.509 < 2e-16 ***
## zipcode98007 5.939e-01 3.485e-02 17.043 < 2e-16 ***
## zipcode98008 6.014e-01 3.309e-02 18.176 < 2e-16 ***
## zipcode98010 3.229e-01 2.959e-02 10.911 < 2e-16 ***
## zipcode98011 2.584e-01 4.301e-02 6.009 1.90e-09 ***
## zipcode98014 2.857e-01 4.725e-02 6.046 1.51e-09 ***
## zipcode98019 2.423e-01 4.662e-02 5.198 2.03e-07 ***
## zipcode98022 2.269e-01 2.573e-02 8.819 < 2e-16 ***
## zipcode98023 -6.497e-02 1.597e-02 -4.068 4.75e-05 ***
## zipcode98024 4.535e-01 4.159e-02 10.905 < 2e-16 ***
## zipcode98027 4.885e-01 2.830e-02 17.263 < 2e-16 ***
## zipcode98028 2.214e-01 4.177e-02 5.302 1.16e-07 ***
## zipcode98029 6.154e-01 3.234e-02 19.032 < 2e-16 ***
## zipcode98030 5.268e-02 1.908e-02 2.761 0.005774 **
## zipcode98031 6.780e-02 1.987e-02 3.412 0.000646 ***
## zipcode98032 -3.875e-02 2.307e-02 -1.679 0.093071 .
## zipcode98033 6.560e-01 3.584e-02 18.303 < 2e-16 ***
## zipcode98034 3.991e-01 3.843e-02 10.386 < 2e-16 ***
## zipcode98038 2.249e-01 2.144e-02 10.487 < 2e-16 ***
## zipcode98039 1.145e+00 4.270e-02 26.820 < 2e-16 ***
## zipcode98040 7.977e-01 2.790e-02 28.595 < 2e-16 ***
## zipcode98042 1.096e-01 1.827e-02 5.996 2.05e-09 ***
## zipcode98045 4.811e-01 3.963e-02 12.138 < 2e-16 ***
## zipcode98052 5.335e-01 3.659e-02 14.579 < 2e-16 ***
## zipcode98053 4.815e-01 3.919e-02 12.284 < 2e-16 ***
## zipcode98055 9.511e-02 2.213e-02 4.297 1.74e-05 ***
## zipcode98056 2.596e-01 2.406e-02 10.790 < 2e-16 ***
## zipcode98058 1.505e-01 2.093e-02 7.192 6.58e-13 ***
## zipcode98059 3.060e-01 2.359e-02 12.967 < 2e-16 ***
## zipcode98065 4.347e-01 3.648e-02 11.915 < 2e-16 ***
## zipcode98070 1.894e-01 2.765e-02 6.849 7.65e-12 ***
## zipcode98072 3.338e-01 4.279e-02 7.801 6.43e-15 ***
## zipcode98074 5.096e-01 3.464e-02 14.711 < 2e-16 ***
## zipcode98075 5.360e-01 3.330e-02 16.097 < 2e-16 ***
## zipcode98077 3.396e-01 4.451e-02 7.628 2.48e-14 ***
## zipcode98092 7.278e-02 1.736e-02 4.194 2.76e-05 ***
## zipcode98102 8.172e-01 3.690e-02 22.149 < 2e-16 ***
## zipcode98103 6.522e-01 3.466e-02 18.819 < 2e-16 ***
## zipcode98105 7.850e-01 3.556e-02 22.077 < 2e-16 ***
## zipcode98106 2.257e-01 2.563e-02 8.804 < 2e-16 ***
## zipcode98107 6.873e-01 3.570e-02 19.255 < 2e-16 ***
## zipcode98108 2.196e-01 2.829e-02 7.764 8.61e-15 ***
## zipcode98109 8.223e-01 3.674e-02 22.382 < 2e-16 ***
## zipcode98112 8.665e-01 3.261e-02 26.573 < 2e-16 ***
## zipcode98115 6.271e-01 3.519e-02 17.821 < 2e-16 ***
## zipcode98116 6.031e-01 2.857e-02 21.108 < 2e-16 ***
## zipcode98117 6.000e-01 3.563e-02 16.841 < 2e-16 ***
## zipcode98118 3.483e-01 2.500e-02 13.931 < 2e-16 ***
## zipcode98119 8.162e-01 3.467e-02 23.539 < 2e-16 ***
## zipcode98122 6.968e-01 3.093e-02 22.527 < 2e-16 ***
## zipcode98125 3.807e-01 3.803e-02 10.009 < 2e-16 ***
## zipcode98126 4.189e-01 2.626e-02 15.953 < 2e-16 ***
## zipcode98133 2.576e-01 3.927e-02 6.561 5.47e-11 ***
## zipcode98136 5.599e-01 2.691e-02 20.804 < 2e-16 ***
## zipcode98144 5.373e-01 2.876e-02 18.685 < 2e-16 ***
## zipcode98146 1.635e-01 2.407e-02 6.792 1.14e-11 ***
## zipcode98148 1.042e-01 3.277e-02 3.179 0.001480 **
## zipcode98155 2.355e-01 4.083e-02 5.768 8.12e-09 ***
## zipcode98166 2.054e-01 2.204e-02 9.322 < 2e-16 ***
## zipcode98168 -2.832e-02 2.331e-02 -1.215 0.224434
## zipcode98177 3.713e-01 4.098e-02 9.061 < 2e-16 ***
## zipcode98178 6.972e-02 2.404e-02 2.900 0.003732 **
## zipcode98188 3.145e-02 2.469e-02 1.274 0.202718
## zipcode98198 2.797e-02 1.869e-02 1.497 0.134436
## zipcode98199 6.427e-01 3.380e-02 19.017 < 2e-16 ***
## yr_built -3.692e-04 9.026e-05 -4.090 4.32e-05 ***
## waterfront1 5.898e-01 1.764e-02 33.442 < 2e-16 ***
## renovated1 3.972e-02 7.866e-03 5.049 4.47e-07 ***
## lat 4.240e-01 8.513e-02 4.981 6.38e-07 ***
## long -4.071e-01 6.119e-02 -6.652 2.96e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2156 on 21504 degrees of freedom
## Multiple R-squared: 0.7033, Adjusted R-squared: 0.7018
## F-statistic: 476.4 on 107 and 21504 DF, p-value: < 2.2e-16
# plot residuals to check for patterns
par(mfrow = c(2, 2))
plot(mod$fitted.values, mod$residuals)
plot(yr_built, mod$residuals)
plot(sqft_lot, mod$residuals)
plot(date_sold, mod$residuals)# Are price per sqft same across all regions?
# Not tidyverse so have to use base code
chisq.test(table(price_per_sqft, zipcode))##
## Pearson's Chi-squared test
##
## data: table(price_per_sqft, zipcode)
## X-squared = 1157700, df = 1146800, p-value = 3.513e-13
# Pull out the coefficients and confidence interval for table and graph
coeff <- summary(mod)$coefficients # get coefficients and related stats
coeff_CI <- as.data.frame(cbind(coeff[-1, ], confint(mod)[-1, ])) # find and bind CI, remove Intercept
# Rename results data frame
names(coeff_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI")
kable(round(coeff_CI,3)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| estimate | se | t | pval | low_CI | high_CI | |
|---|---|---|---|---|---|---|
| date_sold | 0.000 | 0.000 | 16.449 | 0.000 | 0.000 | 0.000 |
| bedrooms1 | 0.219 | 0.065 | 3.370 | 0.001 | 0.091 | 0.346 |
| bedrooms2 | 0.075 | 0.063 | 1.184 | 0.236 | -0.049 | 0.199 |
| bedrooms3 | -0.058 | 0.063 | -0.914 | 0.361 | -0.182 | 0.066 |
| bedrooms4 | -0.159 | 0.063 | -2.520 | 0.012 | -0.283 | -0.035 |
| bedrooms5 | -0.246 | 0.063 | -3.879 | 0.000 | -0.370 | -0.122 |
| bedrooms6 | -0.321 | 0.064 | -4.973 | 0.000 | -0.447 | -0.194 |
| bedrooms7 | -0.446 | 0.072 | -6.183 | 0.000 | -0.588 | -0.305 |
| bedrooms8 | -0.459 | 0.087 | -5.290 | 0.000 | -0.630 | -0.289 |
| bedrooms9 | -0.590 | 0.109 | -5.423 | 0.000 | -0.803 | -0.377 |
| bedrooms10 | -0.622 | 0.140 | -4.448 | 0.000 | -0.896 | -0.348 |
| bedrooms11 | -0.300 | 0.225 | -1.332 | 0.183 | -0.741 | 0.141 |
| sqft_lot | 0.000 | 0.000 | 12.844 | 0.000 | 0.000 | 0.000 |
| floors1.5 | -0.010 | 0.006 | -1.722 | 0.085 | -0.021 | 0.001 |
| floors2 | -0.035 | 0.004 | -8.178 | 0.000 | -0.044 | -0.027 |
| floors2.5 | -0.041 | 0.018 | -2.332 | 0.020 | -0.076 | -0.007 |
| floors3 | -0.063 | 0.011 | -5.836 | 0.000 | -0.084 | -0.042 |
| floors3.5 | -0.068 | 0.077 | -0.878 | 0.380 | -0.220 | 0.084 |
| condition2 | 0.139 | 0.044 | 3.192 | 0.001 | 0.054 | 0.224 |
| condition3 | 0.243 | 0.041 | 5.995 | 0.000 | 0.164 | 0.323 |
| condition4 | 0.275 | 0.041 | 6.769 | 0.000 | 0.195 | 0.354 |
| condition5 | 0.319 | 0.041 | 7.813 | 0.000 | 0.239 | 0.399 |
| grade3 | -0.757 | 0.262 | -2.890 | 0.004 | -1.271 | -0.244 |
| grade4 | -0.723 | 0.232 | -3.118 | 0.002 | -1.177 | -0.268 |
| grade5 | -0.914 | 0.230 | -3.979 | 0.000 | -1.365 | -0.464 |
| grade6 | -0.932 | 0.230 | -4.059 | 0.000 | -1.382 | -0.482 |
| grade7 | -0.963 | 0.230 | -4.195 | 0.000 | -1.413 | -0.513 |
| grade8 | -0.947 | 0.230 | -4.125 | 0.000 | -1.397 | -0.497 |
| grade9 | -0.882 | 0.230 | -3.841 | 0.000 | -1.332 | -0.432 |
| grade10 | -0.836 | 0.230 | -3.637 | 0.000 | -1.286 | -0.385 |
| grade11 | -0.760 | 0.230 | -3.304 | 0.001 | -1.210 | -0.309 |
| grade12 | -0.678 | 0.231 | -2.939 | 0.003 | -1.130 | -0.226 |
| grade13 | -0.475 | 0.238 | -2.001 | 0.045 | -0.941 | -0.010 |
| zipcode98002 | -0.005 | 0.019 | -0.260 | 0.795 | -0.043 | 0.033 |
| zipcode98003 | 0.010 | 0.017 | 0.597 | 0.550 | -0.024 | 0.044 |
| zipcode98004 | 1.005 | 0.032 | 31.841 | 0.000 | 0.943 | 1.067 |
| zipcode98005 | 0.629 | 0.034 | 18.636 | 0.000 | 0.563 | 0.695 |
| zipcode98006 | 0.593 | 0.028 | 21.509 | 0.000 | 0.539 | 0.647 |
| zipcode98007 | 0.594 | 0.035 | 17.043 | 0.000 | 0.526 | 0.662 |
| zipcode98008 | 0.601 | 0.033 | 18.176 | 0.000 | 0.537 | 0.666 |
| zipcode98010 | 0.323 | 0.030 | 10.911 | 0.000 | 0.265 | 0.381 |
| zipcode98011 | 0.258 | 0.043 | 6.009 | 0.000 | 0.174 | 0.343 |
| zipcode98014 | 0.286 | 0.047 | 6.046 | 0.000 | 0.193 | 0.378 |
| zipcode98019 | 0.242 | 0.047 | 5.198 | 0.000 | 0.151 | 0.334 |
| zipcode98022 | 0.227 | 0.026 | 8.819 | 0.000 | 0.176 | 0.277 |
| zipcode98023 | -0.065 | 0.016 | -4.068 | 0.000 | -0.096 | -0.034 |
| zipcode98024 | 0.454 | 0.042 | 10.905 | 0.000 | 0.372 | 0.535 |
| zipcode98027 | 0.489 | 0.028 | 17.263 | 0.000 | 0.433 | 0.544 |
| zipcode98028 | 0.221 | 0.042 | 5.302 | 0.000 | 0.140 | 0.303 |
| zipcode98029 | 0.615 | 0.032 | 19.032 | 0.000 | 0.552 | 0.679 |
| zipcode98030 | 0.053 | 0.019 | 2.761 | 0.006 | 0.015 | 0.090 |
| zipcode98031 | 0.068 | 0.020 | 3.412 | 0.001 | 0.029 | 0.107 |
| zipcode98032 | -0.039 | 0.023 | -1.679 | 0.093 | -0.084 | 0.006 |
| zipcode98033 | 0.656 | 0.036 | 18.303 | 0.000 | 0.586 | 0.726 |
| zipcode98034 | 0.399 | 0.038 | 10.386 | 0.000 | 0.324 | 0.474 |
| zipcode98038 | 0.225 | 0.021 | 10.487 | 0.000 | 0.183 | 0.267 |
| zipcode98039 | 1.145 | 0.043 | 26.820 | 0.000 | 1.061 | 1.229 |
| zipcode98040 | 0.798 | 0.028 | 28.595 | 0.000 | 0.743 | 0.852 |
| zipcode98042 | 0.110 | 0.018 | 5.996 | 0.000 | 0.074 | 0.145 |
| zipcode98045 | 0.481 | 0.040 | 12.138 | 0.000 | 0.403 | 0.559 |
| zipcode98052 | 0.533 | 0.037 | 14.579 | 0.000 | 0.462 | 0.605 |
| zipcode98053 | 0.481 | 0.039 | 12.284 | 0.000 | 0.405 | 0.558 |
| zipcode98055 | 0.095 | 0.022 | 4.297 | 0.000 | 0.052 | 0.138 |
| zipcode98056 | 0.260 | 0.024 | 10.790 | 0.000 | 0.212 | 0.307 |
| zipcode98058 | 0.151 | 0.021 | 7.192 | 0.000 | 0.110 | 0.192 |
| zipcode98059 | 0.306 | 0.024 | 12.967 | 0.000 | 0.260 | 0.352 |
| zipcode98065 | 0.435 | 0.036 | 11.915 | 0.000 | 0.363 | 0.506 |
| zipcode98070 | 0.189 | 0.028 | 6.849 | 0.000 | 0.135 | 0.244 |
| zipcode98072 | 0.334 | 0.043 | 7.801 | 0.000 | 0.250 | 0.418 |
| zipcode98074 | 0.510 | 0.035 | 14.711 | 0.000 | 0.442 | 0.578 |
| zipcode98075 | 0.536 | 0.033 | 16.097 | 0.000 | 0.471 | 0.601 |
| zipcode98077 | 0.340 | 0.045 | 7.628 | 0.000 | 0.252 | 0.427 |
| zipcode98092 | 0.073 | 0.017 | 4.194 | 0.000 | 0.039 | 0.107 |
| zipcode98102 | 0.817 | 0.037 | 22.149 | 0.000 | 0.745 | 0.890 |
| zipcode98103 | 0.652 | 0.035 | 18.819 | 0.000 | 0.584 | 0.720 |
| zipcode98105 | 0.785 | 0.036 | 22.077 | 0.000 | 0.715 | 0.855 |
| zipcode98106 | 0.226 | 0.026 | 8.804 | 0.000 | 0.175 | 0.276 |
| zipcode98107 | 0.687 | 0.036 | 19.255 | 0.000 | 0.617 | 0.757 |
| zipcode98108 | 0.220 | 0.028 | 7.764 | 0.000 | 0.164 | 0.275 |
| zipcode98109 | 0.822 | 0.037 | 22.382 | 0.000 | 0.750 | 0.894 |
| zipcode98112 | 0.866 | 0.033 | 26.573 | 0.000 | 0.803 | 0.930 |
| zipcode98115 | 0.627 | 0.035 | 17.821 | 0.000 | 0.558 | 0.696 |
| zipcode98116 | 0.603 | 0.029 | 21.108 | 0.000 | 0.547 | 0.659 |
| zipcode98117 | 0.600 | 0.036 | 16.841 | 0.000 | 0.530 | 0.670 |
| zipcode98118 | 0.348 | 0.025 | 13.931 | 0.000 | 0.299 | 0.397 |
| zipcode98119 | 0.816 | 0.035 | 23.539 | 0.000 | 0.748 | 0.884 |
| zipcode98122 | 0.697 | 0.031 | 22.527 | 0.000 | 0.636 | 0.757 |
| zipcode98125 | 0.381 | 0.038 | 10.009 | 0.000 | 0.306 | 0.455 |
| zipcode98126 | 0.419 | 0.026 | 15.953 | 0.000 | 0.367 | 0.470 |
| zipcode98133 | 0.258 | 0.039 | 6.561 | 0.000 | 0.181 | 0.335 |
| zipcode98136 | 0.560 | 0.027 | 20.804 | 0.000 | 0.507 | 0.613 |
| zipcode98144 | 0.537 | 0.029 | 18.685 | 0.000 | 0.481 | 0.594 |
| zipcode98146 | 0.164 | 0.024 | 6.792 | 0.000 | 0.116 | 0.211 |
| zipcode98148 | 0.104 | 0.033 | 3.179 | 0.001 | 0.040 | 0.168 |
| zipcode98155 | 0.236 | 0.041 | 5.768 | 0.000 | 0.155 | 0.316 |
| zipcode98166 | 0.205 | 0.022 | 9.322 | 0.000 | 0.162 | 0.249 |
| zipcode98168 | -0.028 | 0.023 | -1.215 | 0.224 | -0.074 | 0.017 |
| zipcode98177 | 0.371 | 0.041 | 9.061 | 0.000 | 0.291 | 0.452 |
| zipcode98178 | 0.070 | 0.024 | 2.900 | 0.004 | 0.023 | 0.117 |
| zipcode98188 | 0.031 | 0.025 | 1.274 | 0.203 | -0.017 | 0.080 |
| zipcode98198 | 0.028 | 0.019 | 1.497 | 0.134 | -0.009 | 0.065 |
| zipcode98199 | 0.643 | 0.034 | 19.017 | 0.000 | 0.576 | 0.709 |
| yr_built | 0.000 | 0.000 | -4.090 | 0.000 | -0.001 | 0.000 |
| waterfront1 | 0.590 | 0.018 | 33.442 | 0.000 | 0.555 | 0.624 |
| renovated1 | 0.040 | 0.008 | 5.049 | 0.000 | 0.024 | 0.055 |
| lat | 0.424 | 0.085 | 4.981 | 0.000 | 0.257 | 0.591 |
| long | -0.407 | 0.061 | -6.652 | 0.000 | -0.527 | -0.287 |
kable(round(coeff_CI[order(coeff_CI$pval, decreasing = FALSE), ], 3)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| estimate | se | t | pval | low_CI | high_CI | |
|---|---|---|---|---|---|---|
| waterfront1 | 0.590 | 0.018 | 33.442 | 0.000 | 0.555 | 0.624 |
| zipcode98004 | 1.005 | 0.032 | 31.841 | 0.000 | 0.943 | 1.067 |
| zipcode98040 | 0.798 | 0.028 | 28.595 | 0.000 | 0.743 | 0.852 |
| zipcode98039 | 1.145 | 0.043 | 26.820 | 0.000 | 1.061 | 1.229 |
| zipcode98112 | 0.866 | 0.033 | 26.573 | 0.000 | 0.803 | 0.930 |
| zipcode98119 | 0.816 | 0.035 | 23.539 | 0.000 | 0.748 | 0.884 |
| zipcode98122 | 0.697 | 0.031 | 22.527 | 0.000 | 0.636 | 0.757 |
| zipcode98109 | 0.822 | 0.037 | 22.382 | 0.000 | 0.750 | 0.894 |
| zipcode98102 | 0.817 | 0.037 | 22.149 | 0.000 | 0.745 | 0.890 |
| zipcode98105 | 0.785 | 0.036 | 22.077 | 0.000 | 0.715 | 0.855 |
| zipcode98006 | 0.593 | 0.028 | 21.509 | 0.000 | 0.539 | 0.647 |
| zipcode98116 | 0.603 | 0.029 | 21.108 | 0.000 | 0.547 | 0.659 |
| zipcode98136 | 0.560 | 0.027 | 20.804 | 0.000 | 0.507 | 0.613 |
| zipcode98107 | 0.687 | 0.036 | 19.255 | 0.000 | 0.617 | 0.757 |
| zipcode98029 | 0.615 | 0.032 | 19.032 | 0.000 | 0.552 | 0.679 |
| zipcode98199 | 0.643 | 0.034 | 19.017 | 0.000 | 0.576 | 0.709 |
| zipcode98103 | 0.652 | 0.035 | 18.819 | 0.000 | 0.584 | 0.720 |
| zipcode98144 | 0.537 | 0.029 | 18.685 | 0.000 | 0.481 | 0.594 |
| zipcode98005 | 0.629 | 0.034 | 18.636 | 0.000 | 0.563 | 0.695 |
| zipcode98033 | 0.656 | 0.036 | 18.303 | 0.000 | 0.586 | 0.726 |
| zipcode98008 | 0.601 | 0.033 | 18.176 | 0.000 | 0.537 | 0.666 |
| zipcode98115 | 0.627 | 0.035 | 17.821 | 0.000 | 0.558 | 0.696 |
| zipcode98027 | 0.489 | 0.028 | 17.263 | 0.000 | 0.433 | 0.544 |
| zipcode98007 | 0.594 | 0.035 | 17.043 | 0.000 | 0.526 | 0.662 |
| zipcode98117 | 0.600 | 0.036 | 16.841 | 0.000 | 0.530 | 0.670 |
| date_sold | 0.000 | 0.000 | 16.449 | 0.000 | 0.000 | 0.000 |
| zipcode98075 | 0.536 | 0.033 | 16.097 | 0.000 | 0.471 | 0.601 |
| zipcode98126 | 0.419 | 0.026 | 15.953 | 0.000 | 0.367 | 0.470 |
| zipcode98074 | 0.510 | 0.035 | 14.711 | 0.000 | 0.442 | 0.578 |
| zipcode98052 | 0.533 | 0.037 | 14.579 | 0.000 | 0.462 | 0.605 |
| zipcode98118 | 0.348 | 0.025 | 13.931 | 0.000 | 0.299 | 0.397 |
| zipcode98059 | 0.306 | 0.024 | 12.967 | 0.000 | 0.260 | 0.352 |
| sqft_lot | 0.000 | 0.000 | 12.844 | 0.000 | 0.000 | 0.000 |
| zipcode98053 | 0.481 | 0.039 | 12.284 | 0.000 | 0.405 | 0.558 |
| zipcode98045 | 0.481 | 0.040 | 12.138 | 0.000 | 0.403 | 0.559 |
| zipcode98065 | 0.435 | 0.036 | 11.915 | 0.000 | 0.363 | 0.506 |
| zipcode98010 | 0.323 | 0.030 | 10.911 | 0.000 | 0.265 | 0.381 |
| zipcode98024 | 0.454 | 0.042 | 10.905 | 0.000 | 0.372 | 0.535 |
| zipcode98056 | 0.260 | 0.024 | 10.790 | 0.000 | 0.212 | 0.307 |
| zipcode98038 | 0.225 | 0.021 | 10.487 | 0.000 | 0.183 | 0.267 |
| zipcode98034 | 0.399 | 0.038 | 10.386 | 0.000 | 0.324 | 0.474 |
| zipcode98125 | 0.381 | 0.038 | 10.009 | 0.000 | 0.306 | 0.455 |
| zipcode98166 | 0.205 | 0.022 | 9.322 | 0.000 | 0.162 | 0.249 |
| zipcode98177 | 0.371 | 0.041 | 9.061 | 0.000 | 0.291 | 0.452 |
| zipcode98022 | 0.227 | 0.026 | 8.819 | 0.000 | 0.176 | 0.277 |
| zipcode98106 | 0.226 | 0.026 | 8.804 | 0.000 | 0.175 | 0.276 |
| floors2 | -0.035 | 0.004 | -8.178 | 0.000 | -0.044 | -0.027 |
| condition5 | 0.319 | 0.041 | 7.813 | 0.000 | 0.239 | 0.399 |
| zipcode98072 | 0.334 | 0.043 | 7.801 | 0.000 | 0.250 | 0.418 |
| zipcode98108 | 0.220 | 0.028 | 7.764 | 0.000 | 0.164 | 0.275 |
| zipcode98077 | 0.340 | 0.045 | 7.628 | 0.000 | 0.252 | 0.427 |
| zipcode98058 | 0.151 | 0.021 | 7.192 | 0.000 | 0.110 | 0.192 |
| zipcode98070 | 0.189 | 0.028 | 6.849 | 0.000 | 0.135 | 0.244 |
| zipcode98146 | 0.164 | 0.024 | 6.792 | 0.000 | 0.116 | 0.211 |
| condition4 | 0.275 | 0.041 | 6.769 | 0.000 | 0.195 | 0.354 |
| long | -0.407 | 0.061 | -6.652 | 0.000 | -0.527 | -0.287 |
| zipcode98133 | 0.258 | 0.039 | 6.561 | 0.000 | 0.181 | 0.335 |
| bedrooms7 | -0.446 | 0.072 | -6.183 | 0.000 | -0.588 | -0.305 |
| zipcode98014 | 0.286 | 0.047 | 6.046 | 0.000 | 0.193 | 0.378 |
| zipcode98011 | 0.258 | 0.043 | 6.009 | 0.000 | 0.174 | 0.343 |
| zipcode98042 | 0.110 | 0.018 | 5.996 | 0.000 | 0.074 | 0.145 |
| condition3 | 0.243 | 0.041 | 5.995 | 0.000 | 0.164 | 0.323 |
| floors3 | -0.063 | 0.011 | -5.836 | 0.000 | -0.084 | -0.042 |
| zipcode98155 | 0.236 | 0.041 | 5.768 | 0.000 | 0.155 | 0.316 |
| bedrooms9 | -0.590 | 0.109 | -5.423 | 0.000 | -0.803 | -0.377 |
| zipcode98028 | 0.221 | 0.042 | 5.302 | 0.000 | 0.140 | 0.303 |
| bedrooms8 | -0.459 | 0.087 | -5.290 | 0.000 | -0.630 | -0.289 |
| zipcode98019 | 0.242 | 0.047 | 5.198 | 0.000 | 0.151 | 0.334 |
| renovated1 | 0.040 | 0.008 | 5.049 | 0.000 | 0.024 | 0.055 |
| lat | 0.424 | 0.085 | 4.981 | 0.000 | 0.257 | 0.591 |
| bedrooms6 | -0.321 | 0.064 | -4.973 | 0.000 | -0.447 | -0.194 |
| bedrooms10 | -0.622 | 0.140 | -4.448 | 0.000 | -0.896 | -0.348 |
| zipcode98055 | 0.095 | 0.022 | 4.297 | 0.000 | 0.052 | 0.138 |
| grade7 | -0.963 | 0.230 | -4.195 | 0.000 | -1.413 | -0.513 |
| zipcode98092 | 0.073 | 0.017 | 4.194 | 0.000 | 0.039 | 0.107 |
| grade8 | -0.947 | 0.230 | -4.125 | 0.000 | -1.397 | -0.497 |
| yr_built | 0.000 | 0.000 | -4.090 | 0.000 | -0.001 | 0.000 |
| zipcode98023 | -0.065 | 0.016 | -4.068 | 0.000 | -0.096 | -0.034 |
| grade6 | -0.932 | 0.230 | -4.059 | 0.000 | -1.382 | -0.482 |
| grade5 | -0.914 | 0.230 | -3.979 | 0.000 | -1.365 | -0.464 |
| bedrooms5 | -0.246 | 0.063 | -3.879 | 0.000 | -0.370 | -0.122 |
| grade9 | -0.882 | 0.230 | -3.841 | 0.000 | -1.332 | -0.432 |
| grade10 | -0.836 | 0.230 | -3.637 | 0.000 | -1.286 | -0.385 |
| zipcode98031 | 0.068 | 0.020 | 3.412 | 0.001 | 0.029 | 0.107 |
| bedrooms1 | 0.219 | 0.065 | 3.370 | 0.001 | 0.091 | 0.346 |
| grade11 | -0.760 | 0.230 | -3.304 | 0.001 | -1.210 | -0.309 |
| condition2 | 0.139 | 0.044 | 3.192 | 0.001 | 0.054 | 0.224 |
| zipcode98148 | 0.104 | 0.033 | 3.179 | 0.001 | 0.040 | 0.168 |
| grade4 | -0.723 | 0.232 | -3.118 | 0.002 | -1.177 | -0.268 |
| grade12 | -0.678 | 0.231 | -2.939 | 0.003 | -1.130 | -0.226 |
| zipcode98178 | 0.070 | 0.024 | 2.900 | 0.004 | 0.023 | 0.117 |
| grade3 | -0.757 | 0.262 | -2.890 | 0.004 | -1.271 | -0.244 |
| zipcode98030 | 0.053 | 0.019 | 2.761 | 0.006 | 0.015 | 0.090 |
| bedrooms4 | -0.159 | 0.063 | -2.520 | 0.012 | -0.283 | -0.035 |
| floors2.5 | -0.041 | 0.018 | -2.332 | 0.020 | -0.076 | -0.007 |
| grade13 | -0.475 | 0.238 | -2.001 | 0.045 | -0.941 | -0.010 |
| floors1.5 | -0.010 | 0.006 | -1.722 | 0.085 | -0.021 | 0.001 |
| zipcode98032 | -0.039 | 0.023 | -1.679 | 0.093 | -0.084 | 0.006 |
| zipcode98198 | 0.028 | 0.019 | 1.497 | 0.134 | -0.009 | 0.065 |
| bedrooms11 | -0.300 | 0.225 | -1.332 | 0.183 | -0.741 | 0.141 |
| zipcode98188 | 0.031 | 0.025 | 1.274 | 0.203 | -0.017 | 0.080 |
| zipcode98168 | -0.028 | 0.023 | -1.215 | 0.224 | -0.074 | 0.017 |
| bedrooms2 | 0.075 | 0.063 | 1.184 | 0.236 | -0.049 | 0.199 |
| bedrooms3 | -0.058 | 0.063 | -0.914 | 0.361 | -0.182 | 0.066 |
| floors3.5 | -0.068 | 0.077 | -0.878 | 0.380 | -0.220 | 0.084 |
| zipcode98003 | 0.010 | 0.017 | 0.597 | 0.550 | -0.024 | 0.044 |
| zipcode98002 | -0.005 | 0.019 | -0.260 | 0.795 | -0.043 | 0.033 |
house_good <- house_data %>% select(sqft_living, sqft_lot, price_per_sqft, lat, long)
p1 <- ggplot(house_good, aes(x = sqft_living, y = price_per_sqft)) +
geom_point(color = 'black', size = 0.5) +
geom_smooth(method="lm", color = 'darkolivegreen3', size = 0.5, se = FALSE) +
theme_classic() +
labs(x = 'Sqft living', y = 'Price per sqft (USD)') +
scale_y_continuous(labels = scales::comma)
p2 <- ggplot(house_good, aes(x = sqft_lot, y = price_per_sqft)) +
geom_point(color = 'black', size = 0.5) +
geom_smooth(method="lm", color = 'darkolivegreen3', size = 0.5, se = FALSE) +
theme_classic() +
labs(x = 'Sqft lot', y = 'Price per sqft (USD)') +
scale_y_continuous(labels = scales::comma)
p3 <- ggplot(house_good, aes(x = lat, y = price_per_sqft)) +
geom_point(color = 'black', size = 0.5) +
geom_smooth(method="lm", color = 'darkolivegreen3', size = 0.5) +
theme_classic() +
labs(x = 'Latitude', y = 'Price per sqft (USD)') +
scale_y_continuous(labels = scales::comma)
p4 <- ggplot(house_good, aes(x = long, y = price_per_sqft)) +
geom_point(color = 'black', size = 0.5) +
geom_smooth(method="lm", color = 'darkolivegreen3', size = 0.5) +
theme_classic() +
labs(x = 'Longitude', y = 'Price per sqft (USD)') +
scale_y_continuous(labels = scales::comma)
grid.arrange(p1,p2,p3,p4, nrow = 2,
top = "House Sales in King County, USA")# Reorder for more clarity
(g1 <- ggplot(coeff_CI, aes(x = estimate, y = reorder(row.names(coeff_CI),desc(pval)))) +
geom_point(size = 3) +
xlim(min(coeff_CI$low_CI), max(coeff_CI$high_CI)) +
ylab("Variable") +
xlab("Coefficient") +
theme_bw() +
geom_segment(aes(yend = reorder(row.names(coeff_CI),desc(pval))),
xend = coeff_CI$high_CI, color = "red") +
geom_segment(aes(yend = reorder(row.names(coeff_CI),desc(coeff_CI$pval))),
xend = coeff_CI$low_CI, color = "red") +
xlab("Coefficient with Confidence Interval") +
geom_vline(xintercept = 0, color = "darkolivegreen3") + # line gives context
theme_classic()
)# Reference: littlemissdata.com
# Clustering Price for Map Index Purpose
set.seed(1)
priceclustering <- house_data %>% select(selling_price)
# Build a kmeans model
model_km3 <- kmeans(priceclustering, centers = 5)
# Extract the cluster assignment vector from the kmeans model
clust_km3 <- model_km3$cluster
# Create a new dataframe appending the cluster assignment
price_cluster <- mutate(house_data, cluster = clust_km3)
price_index <- function(x){
for(i in 1:nrow(x)){
if(x[i,"cluster"] == 1){
x[i,"index"] = '75,000 - 409,500'
} else if(x[i,"cluster"] == 5) {
x[i,"index"] = '409,501 - 673,500'
} else if(x[i,"cluster"] == 3){
x[i,"index"] = '673,501 - 1,137,500'
} else if(x[i,"cluster"] == 2){
x[i,"index"] = '1,137,501 - 2,140,500'
} else {
x[i,"index"] = '2,140,501 - 7,700,000'
}
}
return(x)
}
price_map <- price_index(price_cluster)
price_map$index <- as.factor(price_map$index)
house1 <- filter(price_map, cluster == 1)
house2 <- filter(price_map, cluster == 2)
house3 <- filter(price_map, cluster == 3)
house4 <- filter(price_map, cluster == 4)
house5 <- filter(price_map, cluster == 5)
# Color Index
pal <- colorFactor(palette = c("blue", "red", "green", "pink", "orange"),
levels = c('75,000 - 409,500', '409,501 - 673,500', '673,501 - 1,137,500',
'1,137,501 - 2,140,500', '2,140,501 - 7,700,000'))
# Draw a Map
map_price_area <- leaflet(options = leafletOptions(minZoom = 9, dragging = TRUE)) %>%
addProviderTiles(provider = 'CartoDB')%>%
addCircleMarkers(data = house1, radius = 1,
popup = ~paste0("<b>", 'USD ', selling_price, "</b>", "<br/>", "House area (sqft): ",
sqft_living, "<br/>", "Lot area (sqft): ", sqft_lot),
color = ~pal(index), group = '75,000 - 409,500') %>%
addCircleMarkers(data = house5, radius = 1,
popup = ~paste0("<b>", 'USD ', selling_price, "</b>", "<br/>", "House area (sqft): ",
sqft_living, "<br/>", "Lot area (sqft): ", sqft_lot),
color = ~pal(index), group = '409,501 - 673,500') %>%
addCircleMarkers(data = house3, radius = 1,
popup = ~paste0("<b>", 'USD ', selling_price, "</b>", "<br/>", "House area (sqft): ",
sqft_living, "<br/>", "Lot area (sqft): ", sqft_lot),
color = ~pal(index), group = '673,501 - 1,137,500') %>%
addCircleMarkers(data = house2, radius = 1,
popup = ~paste0("<b>", 'USD ', selling_price, "</b>", "<br/>", "House area (sqft): ",
sqft_living, "<br/>", "Lot area (sqft): ", sqft_lot),
color = ~pal(index), group = '1,137,501 - 2,140,500') %>%
addCircleMarkers(data = house4, radius = 1,
popup = ~paste0("<b>", 'USD ', selling_price, "</b>", "<br/>", "House area (sqft): ",
sqft_living, "<br/>", "Lot area (sqft): ", sqft_lot),
color = ~pal(index), group = '2,140,501 - 7,700,000') %>%
setView(lng = -122.001008, lat = 47.474443, zoom = 9) %>%
addLegend(pal = pal,
values = c('75,000 - 409,500', '409,501 - 673,500', '673,501 - 1,137,500',
'1,137,501 - 2,140,500', '2,140,501 - 7,700,000'),
opacity = 0.5, title = "Price Range", position = "bottomright") %>%
addLayersControl(overlayGroups = c('75,000 - 409,500', '409,501 - 673,500', '673,501 - 1,137,500', '
1,137,501 - 2,140,500', '2,140,501 - 7,700,000'), position = "bottomleft")
map_price_area# How does house sales and revenue vary by zipcode
house_data %>%
group_by(zipcode) %>%
summarise(Count = n(),
Average_Price_per_sqft = mean(price_per_sqft),
Total_Revenue = sum(selling_price),
Average_Revenue = mean(selling_price)) %>%
arrange(desc(Average_Price_per_sqft, Count)) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| zipcode | Count | Average_Price_per_sqft | Total_Revenue | Average_Revenue |
|---|---|---|---|---|
| 98039 | 50 | 568.0834 | 108030330 | 2160606.6 |
| 98004 | 317 | 475.4356 | 429828885 | 1355927.1 |
| 98112 | 269 | 438.6379 | 294689323 | 1095499.3 |
| 98109 | 109 | 433.4012 | 95878975 | 879623.6 |
| 98119 | 184 | 432.2570 | 156298435 | 849448.0 |
| 98102 | 105 | 423.1725 | 94632118 | 901258.3 |
| 98105 | 229 | 405.1544 | 197586978 | 862825.2 |
| 98040 | 282 | 387.2925 | 336772866 | 1194230.0 |
| 98107 | 266 | 382.9520 | 154028209 | 579053.4 |
| 98199 | 317 | 376.5463 | 251007196 | 791820.8 |
| 98103 | 601 | 369.7990 | 351481365 | 584827.6 |
| 98122 | 290 | 367.5142 | 183964452 | 634360.2 |
| 98117 | 553 | 363.5355 | 318967639 | 576795.0 |
| 98115 | 583 | 354.1443 | 361402019 | 619900.5 |
| 98116 | 330 | 348.5510 | 204149276 | 618634.2 |
| 98033 | 432 | 343.1625 | 347206834 | 803719.5 |
| 98136 | 263 | 337.2180 | 145094121 | 551688.7 |
| 98005 | 168 | 314.9292 | 136107699 | 810164.9 |
| 98144 | 343 | 312.2419 | 203929844 | 594547.7 |
| 98008 | 283 | 301.7177 | 182678588 | 645507.4 |
| 98006 | 498 | 299.0915 | 428123020 | 859684.8 |
| 98177 | 255 | 292.9187 | 172427275 | 676185.4 |
| 98126 | 354 | 292.7959 | 150346050 | 424706.4 |
| 98007 | 141 | 290.0490 | 87011817 | 617105.1 |
| 98125 | 410 | 282.4717 | 192476866 | 469455.8 |
| 98070 | 118 | 281.3434 | 57522596 | 487479.6 |
| 98052 | 574 | 280.3863 | 370362856 | 645231.5 |
| 98029 | 321 | 272.0709 | 196661809 | 612653.6 |
| 98053 | 405 | 269.4718 | 274656039 | 678163.1 |
| 98075 | 359 | 268.6398 | 283817019 | 790576.7 |
| 98034 | 545 | 265.9522 | 284300808 | 521652.9 |
| 98074 | 441 | 265.6713 | 302352147 | 685605.8 |
| 98118 | 508 | 263.2555 | 212159816 | 417637.4 |
| 98024 | 81 | 255.2336 | 47022670 | 580526.8 |
| 98133 | 494 | 253.9806 | 191176714 | 386997.4 |
| 98027 | 412 | 251.5723 | 254200124 | 616990.6 |
| 98072 | 273 | 247.5120 | 155598661 | 569958.5 |
| 98155 | 446 | 246.5190 | 188981660 | 423725.7 |
| 98077 | 198 | 244.2784 | 135189426 | 682774.9 |
| 98106 | 335 | 231.3270 | 107059767 | 319581.4 |
| 98166 | 254 | 226.1971 | 117914887 | 464231.8 |
| 98011 | 195 | 225.9872 | 95618536 | 490351.5 |
| 98146 | 288 | 225.4914 | 103531173 | 359483.2 |
| 98028 | 283 | 225.1431 | 130881850 | 462480.0 |
| 98108 | 186 | 224.4433 | 66156204 | 355678.5 |
| 98014 | 124 | 223.0845 | 56496522 | 455617.1 |
| 98045 | 221 | 220.4645 | 97123109 | 439471.1 |
| 98056 | 406 | 215.5361 | 170881563 | 420890.5 |
| 98065 | 310 | 210.8860 | 163667973 | 527961.2 |
| 98010 | 100 | 210.0954 | 42366599 | 423666.0 |
| 98059 | 468 | 207.2423 | 230982585 | 493552.5 |
| 98019 | 190 | 203.0017 | 80709862 | 424788.7 |
| 98178 | 262 | 189.1725 | 81380542 | 310612.8 |
| 98148 | 57 | 185.8321 | 16239790 | 284908.6 |
| 98022 | 234 | 181.8064 | 73875977 | 315709.3 |
| 98055 | 268 | 180.4025 | 81542245 | 304262.1 |
| 98198 | 280 | 178.4286 | 84806087 | 302878.9 |
| 98058 | 455 | 178.2044 | 160891929 | 353608.6 |
| 98168 | 269 | 175.3725 | 64648332 | 240328.4 |
| 98038 | 590 | 173.6562 | 216451884 | 366867.6 |
| 98188 | 136 | 169.0073 | 39314655 | 289078.3 |
| 98042 | 548 | 164.3485 | 170774394 | 311632.1 |
| 98031 | 274 | 161.0404 | 82347930 | 300539.9 |
| 98003 | 280 | 157.1134 | 82351158 | 294111.3 |
| 98092 | 351 | 155.8142 | 117557293 | 334921.1 |
| 98030 | 256 | 155.1562 | 75824123 | 296188.0 |
| 98032 | 125 | 154.2195 | 31412030 | 251296.2 |
| 98001 | 362 | 151.3879 | 101651298 | 280804.7 |
| 98002 | 199 | 151.1741 | 46622523 | 234284.0 |
| 98023 | 499 | 148.9198 | 143079663 | 286732.8 |
# How does average price per sqft vary by zipcode
zipcode_avg_price_per_sqft <- house_data %>%
group_by(zipcode) %>%
summarise(Count = n(),
Average_Price_per_sqft = mean(price_per_sqft),
Total_Revenue = sum(selling_price)) %>%
arrange(desc(Average_Price_per_sqft, Count, bedrooms)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(zipcode, -Average_Price_per_sqft), y = Average_Price_per_sqft)) +
geom_bar(stat="identity", fill = "darkolivegreen3") +
theme_classic() +
theme(text = element_text(size = 10))+
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
legend.position = "none",
panel.background = element_blank()) +
theme(plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = "Zipcode",
y = "Average Price per sqft ($)",
title = "Top 10 most expensive areas in King County",
subtitle = "") +
geom_text(aes(label = str_c('$',round(Average_Price_per_sqft, 0))), size = 3, vjust = 1.5, position = position_dodge(0.9))
zipcode_avg_price_per_sqft# How does number of houses sold vary by zipcode
house_data %>%
group_by(zipcode) %>%
summarise(Count = n(),
Average_Price_per_sqft = mean(price_per_sqft),
Total_Revenue = sum(selling_price)) %>%
arrange(desc(Average_Price_per_sqft, Count, bedrooms)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(zipcode, -Count), y = Count)) +
geom_bar(stat="identity", fill = "darkolivegreen3") +
theme_classic() +
theme(text = element_text(size = 10))+
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
legend.position = "none",
panel.background = element_blank()) +
theme(plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = "Zipcode",
y = "Number of sales",
title = "Top 10 zipcodes with most number of houses sold",
subtitle = "") +
geom_text(aes(label = str_c(round(Count, 0))), size = 3, vjust = 1.5, position = position_dodge(0.9))# how does selling price of the house vary by zipcode
house_data %>%
group_by(zipcode) %>%
summarise(Count = n(),
Average_Price_per_sqft = mean(price_per_sqft),
Total_Revenue = sum(selling_price)) %>%
arrange(desc(Average_Price_per_sqft, Count, bedrooms)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(zipcode, -Total_Revenue), y = Total_Revenue/1000000)) +
geom_bar(stat="identity", fill = "darkolivegreen3") +
theme_classic() +
theme(text = element_text(size = 10))+
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
legend.position = "none",
panel.background = element_blank()) +
theme(plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = "Zipcode",
y = "Total revenue (in mil $)",
title = "Top 10 zipcodes with maximum revenue",
subtitle = "") +
geom_text(aes(label = str_c(round(Total_Revenue/1000000, 0), 'mil $')), size = 3, vjust = 1.5, position = position_dodge(0.9))# how does sales vary by year built
house_data %>%
group_by(yr_built) %>%
summarise(n = n()) %>%
ggplot(aes(x = yr_built, y = n)) +
geom_line(color = 'darkolivegreen3') +
geom_smooth(method="lm", color = 'black', size = 0.5, se = FALSE) +
theme_classic() +
theme(plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = 'Year Built', y = 'Total', title = "Includes number of houses built in 1900 - 2015 in King County",
subtitle = "Sales are more for the houses which are recently built") +
scale_x_continuous(breaks=seq(1900, 2015, 10))# how does sales vary by date sold
house_sales_by_date <- house_data %>%
group_by(date_sold) %>%
summarise(n = n()) %>%
ggplot(aes(x = date_sold, y = n)) +
geom_line(color = 'darkolivegreen3') +
geom_smooth(method="lm", color = 'black', size = 0.5, se = FALSE) +
theme_classic() +
theme(plot.title = element_text(hjust = 0, face = 'bold', color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = 'Date Sold', y = 'Number of houses', title = "Overall sales have dropped between May 2014 - May 2015",
subtitle = "")
house_sales_by_date # how does sales vary by date sold
house_data %>%
group_by(date_sold) %>%
summarise(n = n(),
total_revenue = sum(selling_price)) %>%
ggplot(aes(x = date_sold, y = total_revenue/1000000)) +
geom_line(color = 'darkolivegreen3') +
geom_smooth(method="lm", color = 'black', size = 0.5, se = FALSE) +
theme_classic() +
theme(plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = 'Date Sold', y = 'Total Revenue (in mil $)', title = "Includes daily revenue of sales between May 2014 - May 2015",
subtitle = "There is slight decrease in revenue between May 2014 and May 2015")# how does sales vary by grade and condition
house_data %>%
group_by(grade, condition) %>%
summarize (Revenue_sum = sum(selling_price)) %>%
ggplot(aes(x = reorder(grade, -Revenue_sum), y = Revenue_sum/1000000, fill = condition)) +
geom_bar(stat="identity") +
theme_classic() +
theme(text = element_text(size = 10))+
theme(plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = "Grades",
y = "Total Revenue (in mil USD)",
title = "Houses with Grade 7, on King County scale, have maximum revenue",
subtitle = "Houses with condition 3 have maximum revenue among all grades.",
caption = "There are 13 grades and 5 conditions") +
scale_fill_manual(values=c("darkolivegreen4", "yellow4","darkolivegreen3", "darkolivegreen2", "darkolivegreen1") )# how does revenue vary by condition and living area
ggplot(house_data, aes(x = selling_price/1000, y = sqft_living, color = condition)) +
geom_point(alpha = 0.5) +
geom_smooth(method=lm, se=FALSE, color="black") +
theme_classic()+
theme(legend.title = element_text(size=10),
plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = 'Sold At (,000 $)', y = 'Living Area (sq.ft)', title = "Price increases as living area increases.",
subtitle = "More sales are for lower area and condition 3 houses.") +
guides(color = guide_legend(title = 'Condition')) +
scale_color_brewer(palette="Paired")# how does revenue vary by grade and living area
ggplot(house_data, aes(x = selling_price/1000, y = sqft_living, color = grade)) +
geom_point(alpha = 0.5) +
geom_smooth(method=lm, se=FALSE, color="black") +
theme_classic()+
theme(legend.title = element_text(size=10),
plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = 'Sold At (,000 $)', y = 'Living Area (sq.ft)', title = "Higher prices houses have higher grades.",
subtitle = "More revenue and sales are for houses between grade 7 and 13.") +
guides(color = guide_legend(title = 'Grade')) +
scale_color_brewer(palette="Paired")# Analysis of house sales and revenue by number of bedrooms
house_data %>%
group_by(bedrooms) %>%
summarise(Count = n(),
Average_Price_per_sqft = mean(price_per_sqft),
Total_Revenue = sum(selling_price),
Average_Revenue = mean(selling_price)) %>%
arrange(desc(Average_Price_per_sqft, Count)) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| bedrooms | Count | Average_Price_per_sqft | Total_Revenue | Average_Revenue |
|---|---|---|---|---|
| 1 | 199 | 384.7627 | 63210934 | 317642.9 |
| 2 | 2760 | 332.8027 | 1107788602 | 401372.7 |
| 0 | 13 | 293.6016 | 5323550 | 409503.8 |
| 8 | 13 | 268.1689 | 14366000 | 1105076.9 |
| 3 | 9824 | 260.0878 | 4580263939 | 466232.1 |
| 5 | 1601 | 247.5242 | 1259346326 | 786599.8 |
| 4 | 6882 | 244.0516 | 4372957028 | 635419.5 |
| 9 | 6 | 240.4845 | 5363999 | 893999.8 |
| 6 | 272 | 236.6283 | 224541613 | 825520.6 |
| 7 | 38 | 232.3056 | 36145017 | 951184.7 |
| 10 | 3 | 218.7306 | 2458000 | 819333.3 |
| 11 | 1 | 173.3333 | 520000 | 520000.0 |
# Analysis of house sales and revenue by number of bathrooms
house_data %>%
group_by(bathrooms) %>%
summarise(Count = n(),
Average_Price_per_sqft = mean(price_per_sqft),
Total_Revenue = sum(selling_price),
Average_revenue = mean(selling_price)) %>%
arrange(desc(Average_Price_per_sqft, Count)) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| bathrooms | Count | Average_Price_per_sqft | Total_Revenue | Average_revenue |
|---|---|---|---|---|
| 7.75 | 1 | 696.1577 | 6885000 | 6885000.0 |
| 6 | 6 | 463.8275 | 17681000 | 2946833.3 |
| 8 | 2 | 403.6971 | 9980000 | 4990000.0 |
| 1.25 | 9 | 403.3049 | 5590950 | 621216.7 |
| 5.5 | 10 | 381.3343 | 25223000 | 2522300.0 |
| 4.75 | 23 | 379.1843 | 46489990 | 2021303.9 |
| 6.25 | 2 | 372.2870 | 6188000 | 3094000.0 |
| 0.75 | 72 | 359.9760 | 21205502 | 294520.9 |
| 5.25 | 13 | 346.4811 | 23623300 | 1817176.9 |
| 4.25 | 79 | 329.1272 | 120536180 | 1525774.4 |
| 5 | 21 | 319.9156 | 35143330 | 1673491.9 |
| 5.75 | 4 | 317.1154 | 9970000 | 2492500.0 |
| 3.75 | 155 | 316.9834 | 185568620 | 1197216.9 |
| 0 | 10 | 312.4947 | 4481600 | 448160.0 |
| 4 | 136 | 307.8692 | 172408467 | 1267709.3 |
| 1 | 3852 | 305.1286 | 1336802838 | 347041.2 |
| 3.25 | 589 | 300.5422 | 571530131 | 970339.8 |
| 6.75 | 2 | 295.5921 | 5468000 | 2734000.0 |
| 4.5 | 100 | 293.1957 | 133362021 | 1333620.2 |
| 3.5 | 731 | 272.4028 | 681243129 | 931933.1 |
| 1.5 | 1446 | 271.4661 | 591879911 | 409322.2 |
| 2 | 1930 | 259.0156 | 883727157 | 457889.7 |
| 1.75 | 3047 | 258.9230 | 1385883303 | 454835.3 |
| 3 | 753 | 256.6291 | 533436670 | 708415.2 |
| 2.25 | 2047 | 254.5052 | 1092436356 | 533676.8 |
| 6.5 | 2 | 248.2306 | 3418890 | 1709445.0 |
| 2.75 | 1185 | 247.5223 | 782372969 | 660230.4 |
| 2.5 | 5380 | 233.7021 | 2978349194 | 553596.5 |
| 0.5 | 4 | 211.2323 | 949500 | 237375.0 |
| 7.5 | 1 | 111.1111 | 450000 | 450000.0 |
# Analysis of house sales and revenue by number of floors
house_data %>%
group_by(floors) %>%
summarise(Count = n(),
Average_Price_per_sqft = mean(price_per_sqft),
Total_Revenue = sum(selling_price),
Average_revenue = mean(selling_price)) %>%
arrange(desc(Average_Price_per_sqft, Count)) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| floors | Count | Average_Price_per_sqft | Total_Revenue | Average_revenue |
|---|---|---|---|---|
| 3.5 | 8 | 347.5220 | 7466500 | 933312.5 |
| 3 | 613 | 328.1688 | 357088462 | 582526.0 |
| 2.5 | 161 | 320.5230 | 170715785 | 1060346.5 |
| 1.5 | 1910 | 299.3680 | 1067653028 | 558980.6 |
| 1 | 10679 | 264.7266 | 4721849156 | 442162.1 |
| 2 | 8241 | 249.2975 | 5347512077 | 648891.2 |
# Analysis of house sales and revenue by waterfront view
house_data %>%
group_by(waterfront) %>%
summarise(Count = n(),
Average_Price_per_sqft = mean(price_per_sqft),
Total_Revenue = sum(selling_price),
Average_Revenue = mean(selling_price)) %>%
arrange(desc(Average_Price_per_sqft, Count)) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| waterfront | Count | Average_Price_per_sqft | Total_Revenue | Average_Revenue |
|---|---|---|---|---|
| 1 | 163 | 508.0964 | 270885792 | 1661876.0 |
| 0 | 21449 | 262.2967 | 11401399216 | 531558.5 |
# Analysis of house sales and revenue by houses renovated
house_data %>%
group_by(renovated) %>%
summarise(Count = n(),
Average_Price_per_sqft = mean(price_per_sqft),
Total_Revenue = sum(selling_price),
Average_Revenue = mean(selling_price)) %>%
arrange(desc(Average_Price_per_sqft, Count)) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
row_spec(1, background = "#e5f5e0")| renovated | Count | Average_Price_per_sqft | Total_Revenue | Average_Revenue |
|---|---|---|---|---|
| 1 | 914 | 319.3024 | 694986433 | 760379.0 |
| 0 | 20698 | 261.7151 | 10977298575 | 530355.5 |
ggplot(house_data, aes(x = lat, y = long, color = condition)) +
geom_point(alpha = 0.5) +
geom_smooth(method=lm, se=FALSE, color="black") +
theme_classic()+
theme(legend.title = element_text(size=10),
plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = 'latitude', y = 'longitude', title = "Sales remain almost same across latitude and longitude",
subtitle = "Comparatively fewer sales in higher latitude and lower longitude") +
guides(color = guide_legend(title = 'Condition')) +
scale_color_brewer(palette="Paired")# how does revenue vary by condition and living area
ggplot(house_data, aes(x = selling_price/1000, y = sqft_living, color = bedrooms)) +
geom_point(alpha = 0.5) +
geom_smooth(method=lm, se=FALSE, color="black") +
theme_classic()+
theme(legend.title = element_text(size=10),
plot.title = element_text(hjust = 0, face = 'bold',color = 'black'),
plot.subtitle = element_text(face = "italic")) +
labs(x = 'Sold At (,000 $)', y = 'Living Area (sq.ft)', title = "Price increases as living area increases.",
subtitle = "More sales are for 3 to 7 bedroom houses .") +
guides(color = guide_legend(title = 'Bedrooms')) +
scale_color_brewer(palette="Paired")